metadata = read.csv("https://github.com/borevitzlab/brachy-genotyping/raw/master/metadata/brachy-metadata.csv")
#str(metadata)
spp = read.csv("data/species_id.csv")
#str(spp)
#snpgdsVCF2GDS("data/2017-11-30_bhybridum_filtered_default.vcf.gz",
# "data/2017-11-30_bhybridum_filtered_default.gds")
geno = snpgdsOpen("data/2017-11-30_bhybridum_filtered_default.gds",
allow.duplicate = T, readonly = T)
samp = snpgdsSummary(geno)$sample.id
## The file name: /home/kevin/ws/brachy-genotyping-notes/data/2017-11-30_bhybridum_filtered_default.gds
## The total number of samples: 1968
## The total number of SNPs: 1278299
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
snp.missfilt = snpgdsSelectSNP(geno, missing.rate=0.999, autosome.only=F)
## Excluding 51,452 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: 0.999)
The total number of SNPs remaining is 1226847.
Directly subset the three species’ genotype matricies.
chrom = read.gdsn(index.gdsn(geno, "snp.chromosome"))[snp.missfilt]
snp.dis = snp.missfilt[grep('Bd21-3', chrom)]
snp.sta = snp.missfilt[grep('Bstacei', chrom)]
snp.all = snp.missfilt
samp.hyb = filter(spp, species.id == "hybridum")$sample
samp.dis = filter(spp, species.id == "distachyon")$sample
samp.sta = filter(spp, species.id == "stacei")$sample
These things are in common for each species of brachy, so turn them into functions.
ssp.filt = function(geno, samps, snps, max.snp.miss.rate=0.99,
max.samp.miss.rate=0.99, min.maf=0.001 ) {
miss.samp = snpgdsSampMissRate(geno, snp.id=snps, sample.id=samps)
hist(miss.samp, breaks=100, main="Sample Missing Data (pre-filt)")
abline(v=max.samp.miss.rate, col="blue", lwd=2)
samps = samps[miss.samp <= max.samp.miss.rate]
srf = snpgdsSNPRateFreq(geno, sample.id=samps, snp.id = snps)
miss.snp = srf$MissingRate
hist(miss.snp, breaks=100, main="SNP missing data")
abline(v=max.snp.miss.rate, col="blue", lwd=2)
maf = srf$MinorFreq
hist(maf, breaks=50, main="SNP MAF")
abline(v=min.maf, lwd=2, col="blue")
snps = snpgdsSelectSNP(geno, sample.id=samps, snp.id=snps, maf=min.maf,
missing.rate=max.snp.miss.rate, autosome.only=F)
miss.samp = snpgdsSampMissRate(geno, snp.id=snps, sample.id=samps)
hist(miss.samp, breaks=100, main="Sample Missing Data (post-filt)")
print(paste("Num SNPs:", length(snps)))
print(paste("Num Samples:", length(samps)))
return(list(snps=snps, samps=samps, miss.samp=miss.samp))
}
ssp.geno = function(geno, filt) {
ibs = snpgdsIBS(geno, sample.id=filt$samps, snp.id=filt$snps,
autosome.only=F, num.thread=4)
ibs.nacnt = rowSums(is.na(ibs$ibs))
table(ibs.nacnt)
return(ibs)
}
distimpute = function(ibs, thresh=5, maxdist = 1) {
dist = 1 - ibs$ibs
nasum = colSums(is.na(dist))
pass = nasum < thresh
dist = dist[pass,pass]
ibs$sample.id = ibs$sample.id[pass]
dist.ut = dist
dist.ut[upper.tri(dist.ut,diag=T)] = 0
nasum = colSums(is.na(dist.ut))
# impute
for (j in which(nasum > 0)) {
for (i in which(is.na(dist[,j]))) {
k = which(dist.ut[,j] == max(dist.ut[,j], na.rm=T))
if (length(k) > 1) {
k = k[1]
}
if (dist[k, j] <= maxdist) {
dist[i, j] = max(dist[k, j], # k, j is neighbour -> self
dist[k, i]) # k, i is neighbour -> other
dist[j, i] = max(dist[k, j], # k, j is neighbour -> self
dist[k, i]) # k, i is neighbour -> other
}
}
}
ibs$ibs = 1 - dist
return(ibs)
}
distimpute2 = function(ibs, max.NAs=0, max.dist = 0.2) {
dist = 1 - ibs$ibs
dist.ut = dist
dist.ut[upper.tri(dist.ut,diag=T)] = 0
nasum = colSums(is.na(dist.ut))
num.imputed = 0
# impute
for (j in which(nasum > 0)) {
for (i in which(is.na(dist[,j]))) {
k = which(dist.ut[,j] == max(dist.ut[,j], na.rm=T))
if (length(k) > 1) {
k = k[1]
}
if (dist[k, j] <= max.dist) {
num.imputed = num.imputed + 1
dist[i, j] = max(dist[k, j], # k, j is neighbour -> self
dist[k, i]) # k, i is neighbour -> other
dist[j, i] = max(dist[k, j], # k, j is neighbour -> self
dist[k, i]) # k, i is neighbour -> other
}
}
}
num.removed = 0
while (sum(is.na(dist)) > 0) {
rm = which.max(colSums(is.na(dist)))
dist = dist[-rm,]
dist = dist[,-rm]
ibs$sample.id = ibs$sample.id[-rm]
num.removed = num.removed + 1
}
ibs$ibs = 1 - dist
ibs$num.imputed = num.imputed
ibs$num.removed = num.removed
print(paste("Num imputed:", num.imputed))
print(paste("Num removed:", num.removed))
return(ibs)
}
samp.rils = metadata %>%
filter(grepl("^RIL", accession)) %>%
select(anon.name) %>%
unlist
samp.dis.sansrils = samp.dis[! samp.dis %in% samp.rils]
filt.dis = ssp.filt(geno, samp.dis.sansrils, snp.dis, min.maf=0.01,
max.samp.miss.rate = 0.995, max.snp.miss.rate=0.95)
## Excluding 751,055 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.95)
## [1] "Num SNPs: 81400"
## [1] "Num Samples: 514"
ibs.dis = ssp.geno(geno, filt.dis)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 514 samples, 81,400 SNPs
## using 4 (CPU) cores
## IBS: the sum of all selected genotypes (0,1,2) = 6296601
## Thu Dec 14 16:03:36 2017 (internal increment: 30336)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 1s
## Thu Dec 14 16:03:37 2017 Done.
ibs.dis.imp = distimpute2(ibs.dis, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 11"
dim(ibs.dis.imp$ibs)
## [1] 503 503
After the distance interpolation, we have interpolated 0 entries, and have removed 11 samples.
Run over a range of z thresholds.
grouping = metadata %>%
select(accession, anon.name)
for (zthresh in c(seq(1, 4, 0.5), 5:10)) {
cut.dis = snpgdsHCluster(ibs.dis.imp) %>%
snpgdsCutTree(outlier.n=0, z.threshold = zthresh)
s = cut.dis$sample.id
grouping = grouping %>%
filter(anon.name %in% cut.dis$sample.id)
d = cut.dis$samp.group[match(grouping$anon.name, s)]
snpgdsDrawTree(cut.dis)
grouping[[paste0("GroupAt", zthresh, "_Ngrps", length(table(d)))]] = d
}
## Determine groups by permutation (Z threshold: 1, outlier threshold: 0):
## Create 298 groups.
## Determine groups by permutation (Z threshold: 1.5, outlier threshold: 0):
## Create 209 groups.
## Determine groups by permutation (Z threshold: 2, outlier threshold: 0):
## Create 165 groups.
## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 0):
## Create 132 groups.
## Determine groups by permutation (Z threshold: 3, outlier threshold: 0):
## Create 104 groups.
## Determine groups by permutation (Z threshold: 3.5, outlier threshold: 0):
## Create 74 groups.
## Determine groups by permutation (Z threshold: 4, outlier threshold: 0):
## Create 60 groups.
## Determine groups by permutation (Z threshold: 5, outlier threshold: 0):
## Create 49 groups.
## Determine groups by permutation (Z threshold: 6, outlier threshold: 0):
## Create 42 groups.
## Determine groups by permutation (Z threshold: 7, outlier threshold: 0):
## Create 37 groups.
## Determine groups by permutation (Z threshold: 8, outlier threshold: 0):
## Create 29 groups.
## Determine groups by permutation (Z threshold: 9, outlier threshold: 0):
## Create 16 groups.
## Determine groups by permutation (Z threshold: 10, outlier threshold: 0):
## Create 16 groups.
write.csv(grouping, "out/group_zscores.csv", row.names = F)
best.z.thresh = 2.5
ibs.dis.plt = ibs.dis.imp
ibs.dis.plt$sample.id = paste(metadata$anon.name[match(ibs.dis.imp$sample.id, metadata$anon.name)],
metadata$accession[match(ibs.dis.imp$sample.id, metadata$anon.name)])
hc.dis.plt = snpgdsHCluster(ibs.dis.plt) %>%
snpgdsCutTree(outlier.n=1, z.threshold = best.z.thresh, label.H = F, label.Z = F)
## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 1):
## Create 131 groups.
snpgdsDrawTree(hc.dis.plt, leaflab="perpendicular")
dodgy = read.csv("data/dodgy-samples.csv")$anon.name
samp.dis.genocall = samp.dis.sansrils[! samp.dis.sansrils %in% dodgy]
filt.dis.geno = ssp.filt(geno, samp.dis.genocall, snp.dis, min.maf=0.01,
max.samp.miss.rate = 0.995, max.snp.miss.rate=0.95)
## Excluding 749,655 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.95)
## [1] "Num SNPs: 82800"
## [1] "Num Samples: 495"
ibs.dis = ssp.geno(geno, filt.dis.geno)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 495 samples, 82,800 SNPs
## using 4 (CPU) cores
## IBS: the sum of all selected genotypes (0,1,2) = 6233397
## Thu Dec 14 16:04:13 2017 (internal increment: 31488)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Thu Dec 14 16:04:13 2017 Done.
ibs.dis.imp = distimpute2(ibs.dis, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 5"
dim(ibs.dis.imp$ibs)
## [1] 490 490
best.z.thresh = 2.5
best.z.hc = snpgdsHCluster(ibs.dis.imp) %>%
snpgdsCutTree(outlier.n=1, z.threshold = best.z.thresh, label.Z = F)
## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 1):
## Create 134 groups.
snpgdsDrawTree(best.z.hc)
sort(table(best.z.hc$samp.group), decreasing = T)
##
## G079 G018 G019 G012 G015 G052
## 16 10 10 9 9 9
## G082 G025 G037 G077 G084 G013
## 9 8 8 8 8 7
## G046 G102 G105 G107 G002 G004
## 7 7 7 7 6 6
## G005 G024 G042 G059 G061 G074
## 6 6 6 6 6 6
## G081 G083 G092 G009 G010 G014
## 6 6 6 5 5 5
## G029 G030 G036 G041 G048 G063
## 5 5 5 5 5 5
## G068 G090 G098 G003 G008 G011
## 5 5 5 4 4 4
## G020 G021 G026 G027 G028 G038
## 4 4 4 4 4 4
## G039 G043 G045 G047 G057 G058
## 4 4 4 4 4 4
## G069 G071 G072 G085 G087 G093
## 4 4 4 4 4 4
## G096 G099 G023 G033 G049 G050
## 4 4 3 3 3 3
## G055 G056 G060 G070 G073 G076
## 3 3 3 3 3 3
## G086 G088 G101 G103 G104 G106
## 3 3 3 3 3 3
## G001 G006 G007 G016 G017 G022
## 2 2 2 2 2 2
## G031 G032 G034 G035 G040 G044
## 2 2 2 2 2 2
## G051 G053 G054 G062 G064 G065
## 2 2 2 2 2 2
## G066 G067 G075 G078 G080 G089
## 2 2 2 2 2 2
## G091 G094 G095 G097 G100 Outlier001
## 2 2 2 2 2 1
## Outlier002 Outlier003 Outlier004 Outlier005 Outlier006 Outlier007
## 1 1 1 1 1 1
## Outlier008 Outlier009 Outlier010 Outlier011 Outlier012 Outlier013
## 1 1 1 1 1 1
## Outlier014 Outlier015 Outlier016 Outlier017 Outlier018 Outlier019
## 1 1 1 1 1 1
## Outlier020 Outlier021 Outlier022 Outlier023 Outlier024 Outlier025
## 1 1 1 1 1 1
## Outlier026 Outlier027
## 1 1
goodgroups = data.frame(best.z.hc[c("sample.id", "samp.group")]) %>%
filter(grepl("^G", samp.group))
genocall = metadata %>%
select(accession, anon.name) %>%
right_join(goodgroups, by=c("anon.name"="sample.id"))
## Warning: Column `anon.name`/`sample.id` joining factors with different
## levels, coercing to character vector
genocall$missing.rate = snpgdsSampMissRate(geno, sample.id=as.character(genocall$anon.name),
snp.id = filt.dis$snps)
write.csv(genocall, "out/genotypes.csv", row.names = F)
Now, we take the indiviudal run with the least missing data from each clonal lineage
genocall.best = genocall %>%
group_by(samp.group) %>%
summarise(anon.name = anon.name[which.min(missing.rate)],
accession = accession[which.min(missing.rate)],
missing.rate = min(missing.rate))
write.csv(genocall.best, "out/genotypes_best.csv", row.names = F)
samps = genocall.best$anon.name
snps = filt.dis$snps
max.snp.miss.rate = 0.9
min.maf = 0.05
srf = snpgdsSNPRateFreq(geno, sample.id=samps, snp.id=snps)
miss.snp = srf$MissingRate
hist(miss.snp, breaks=100, main="SNP missing data")
abline(v=max.snp.miss.rate, lwd=2, col="blue")
maf = srf$MinorFreq
hist(maf, breaks=50, main="SNP MAF")
abline(v=min.maf, lwd=2, col="blue")
snps = snpgdsSelectSNP(geno, sample.id=samps, snp.id=snps, maf=min.maf,
missing.rate=max.snp.miss.rate, autosome.only=F)
## Excluding 56,652 SNPs (monomorphic: TRUE, MAF: 0.05, missing rate: 0.9)
srf.post = snpgdsSNPRateFreq(geno, sample.id=samps, snp.id=snps)
hist(srf.post$MissingRate, breaks=100, main="SNP missing data (post)")
print(paste("Num SNPs:", length(snps)))
## [1] "Num SNPs: 24748"
print(paste("Num Samples:", length(samps)))
## [1] "Num Samples: 107"
struct.genos = snpgdsGetGeno(geno, sample.id = samps, snp.id=snps)
## Genotype matrix: 107 samples X 24748 SNPs
hist(colSums(struct.genos, na.rm = T))
table(colSums(struct.genos, na.rm = T)==2)
##
## FALSE TRUE
## 24420 328
hist(colSums(is.na(struct.genos))/nrow(struct.genos))
rownames(struct.genos) = samps
struct.genos[struct.genos==1] = NA
write.table(struct.genos, "out/structure_genos_bdis.txt", na="-9",
sep="\t", quote = F, col.names = F)
ibs.dis.plt = ibs.dis.imp
ibs.dis.plt$sample.id = paste(metadata$anon.name[match(ibs.dis.imp$sample.id, metadata$anon.name)],
metadata$accession[match(ibs.dis.imp$sample.id, metadata$anon.name)])
hc.dis.plt = snpgdsHCluster(ibs.dis.plt) %>%
snpgdsCutTree(outlier.n=1, z.threshold = best.z.thresh, label.H = F, label.Z = F)
## Determine groups by permutation (Z threshold: 2.5, outlier threshold: 1):
## Create 133 groups.
snpgdsDrawTree(hc.dis.plt, leaflab="perpendicular")
filt.hyb = ssp.filt(geno, samp.hyb, snp.all, min.maf=0.01,
max.samp.miss.rate = 0.99, max.snp.miss.rate=0.97)
## Excluding 1,122,425 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.97)
## [1] "Num SNPs: 104422"
## [1] "Num Samples: 692"
ibs.hyb = ssp.geno(geno, filt.hyb)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 692 samples, 104,422 SNPs
## using 4 (CPU) cores
## IBS: the sum of all selected genotypes (0,1,2) = 8083459
## Thu Dec 14 16:04:26 2017 (internal increment: 22528)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 2s
## Thu Dec 14 16:04:28 2017 Done.
ibs.hyb.imp = distimpute2(ibs.hyb, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 13"
table(colSums(is.na(ibs.hyb.imp$ibs)))
##
## 0
## 679
snpgdsHCluster(ibs.hyb.imp) %>%
snpgdsCutTree() %>%
snpgdsDrawTree(outlier.n = 0)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 23 groups.
filt.sta = ssp.filt(geno, samp.sta, snp.sta, min.maf=0.01,
max.samp.miss.rate = 0.99, max.snp.miss.rate=0.97)
## Excluding 334,637 SNPs (monomorphic: TRUE, MAF: 0.01, missing rate: 0.97)
## [1] "Num SNPs: 59755"
## [1] "Num Samples: 56"
ibs.sta = ssp.geno(geno, filt.sta)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 56 samples, 59,755 SNPs
## using 4 (CPU) cores
## IBS: the sum of all selected genotypes (0,1,2) = 328781
## Thu Dec 14 16:04:32 2017 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Thu Dec 14 16:04:32 2017 Done.
ibs.sta.imp = distimpute2(ibs.hyb, max.dist = 0.2)
## [1] "Num imputed: 0"
## [1] "Num removed: 13"
table(colSums(is.na(ibs.sta.imp$ibs)))
##
## 0
## 679
snpgdsHCluster(ibs.sta.imp) %>%
snpgdsCutTree(outlier.n=0) %>%
snpgdsDrawTree()
## Determine groups by permutation (Z threshold: 15, outlier threshold: 0):
## Create 23 groups.